################################################################################
## Create figure A2-A3
## from NCDPS_describing_omitted_records.R and NC_DPSetup_November20pull.R
################################################################################
library(tidyverse)

NC_dataloss <- read_csv("nc_dataloss_describe.csv")

# ---- Race: stacked bars of racial composition for each row of the table ----

# Values of `row` in the order they should appear on the x-axis. These must
# match the `row` column after the "Status unclear" relabelling below; any
# other value becomes NA on the axis.
step_order <- c("Universe", "Under supervision", "Deceased",
                "Felony status unclear", "Duplicates", "Over 70",
                "No last name", "Citizens", "Used in pilots",
                "Final: Addresses found")

# Source column -> legend label, listed in legend/stacking order. Keeping the
# label next to the column it describes prevents the legend from drifting out
# of sync with the data (previously "Native" and "Unknown" were swapped in
# the legend relative to the series they described).
race_labels <- c(prop_asian = "Asian",
                 prop_native = "Native",
                 prop_unknown = "Unknown",
                 prop_white = "White",
                 prop_black = "Black")

# prepare the plot
race_tab <- NC_dataloss %>%
  select(row, prop_black, prop_white, prop_asian, prop_native, prop_unknown) %>%
  mutate(across(prop_black:prop_unknown, ~ round(.x, 3))) %>%
  mutate(row = ifelse(row == "Status unclear", "Felony status unclear", row))

# Long format: one record per (row, race) pair.
d <- race_tab %>%
  pivot_longer(all_of(names(race_labels)),
               names_to = "race", values_to = "value") %>%
  mutate(race = factor(race,
                       levels = names(race_labels),
                       labels = unname(race_labels)),
         step = factor(row, levels = step_order))

# NOTE(review): the prop_* inputs are rounded to 3 decimals, which suggests
# they are 0-1 proportions rather than percentages -- confirm that "Percent"
# is the right y-axis title.
plot <- ggplot(d, aes(fill = race, y = value, x = step)) +
  geom_bar(position = "stack", stat = "identity") +
  scale_fill_discrete(name = " ") +
  theme_bw(base_size = 16, base_family = "Times") +
  theme(panel.border = element_rect(fill = NA, colour = NA),
        legend.title = element_blank(),
        panel.grid.major = element_line(colour = "grey95"),
        panel.grid.minor = element_line(colour = NA),
        strip.text.x = element_text(size = 16),
        strip.background = element_rect(fill = "white"),
        # right-align the rotated labels so each one ends at its tick
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1.1)) +
  # A single blank string keeps the empty title strip above the panel;
  # `cex` is a base-graphics argument and has no meaning inside labs().
  labs(y = "Percent", x = " ", title = " ")

ggsave(filename = "race_data_lost.png", plot = plot, width = 9, height = 5)

## gender: stacked bars of male/female shares for each row of the table

# Values of `row` in the order they should appear on the x-axis. These must
# match the `row` column after the "Status unclear" relabelling below; any
# other value becomes NA on the axis.
step_order <- c("Universe", "Under supervision", "Deceased",
                "Felony status unclear", "Duplicates", "Over 70",
                "No last name", "Citizens", "Used in pilots",
                "Final: Addresses found")

# Source column -> legend label, listed in legend/stacking order.
gender_labels <- c(prop_female = "Female", prop_male = "Male")

gender_tab <- NC_dataloss %>%
  select(row, prop_male, prop_female) %>%
  mutate(across(c(prop_male, prop_female), ~ round(.x, 3))) %>%
  mutate(row = ifelse(row == "Status unclear", "Felony status unclear", row))

# Long format: one record per (row, gender) pair.
d <- gender_tab %>%
  pivot_longer(all_of(names(gender_labels)),
               names_to = "gender", values_to = "value") %>%
  mutate(gender = factor(gender,
                         levels = names(gender_labels),
                         labels = unname(gender_labels)),
         step = factor(row, levels = step_order))

# NOTE(review): the prop_* inputs are rounded to 3 decimals, which suggests
# they are 0-1 proportions rather than percentages -- confirm that "Percent"
# is the right y-axis title.
plot <- ggplot(d, aes(fill = gender, y = value, x = step)) +
  geom_bar(position = "stack", stat = "identity") +
  theme_bw(base_size = 16, base_family = "Times") +
  theme(panel.border = element_rect(fill = NA, colour = NA),
        legend.title = element_blank(),
        panel.grid.major = element_line(colour = "grey95"),
        panel.grid.minor = element_line(colour = NA),
        strip.text.x = element_text(size = 16),
        strip.background = element_rect(fill = "white"),
        # right-align the rotated labels so each one ends at its tick
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1.1)) +
  # A single blank string keeps the empty title strip above the panel;
  # `cex` is a base-graphics argument and has no meaning inside labs().
  labs(y = "Percent", x = " ", title = " ")

ggsave(filename = "gender_data_lost.png", plot = plot, width = 9, height = 5)

# mean age: one point per row of the table

# Values of `row` in x-axis order, paired position-by-position with the tick
# label displayed for each.
step_rows <- c("Universe", "Under supervision", "Deceased",
               "Felony status unclear", "Duplicates", "Over 70",
               "No last name", "Citizens", "Used in pilots",
               "Final: Addresses found")
# NOTE(review): the "Citizens" row is displayed as "Non-citizens" here, while
# the race and gender figures label the same row "Citizens" -- confirm which
# wording is intended. The existing label is kept unchanged.
step_labels <- c("Universe", "Under supervision", "Deceased",
                 "Felony status unclear", "Duplicates", "Over 70",
                 "No last name", "Non-citizens", "Used in pilots",
                 "Final: Addresses found")

d <- NC_dataloss %>%
  select(row, mean_age) %>%
  mutate(row = ifelse(row == "Status unclear", "Felony status unclear", row),
         # rows not listed in step_rows become NA
         step = factor(row, levels = step_rows, labels = step_labels))

plot <- ggplot(d, aes(y = mean_age, x = step)) +
  geom_point() +
  theme_bw(base_size = 16, base_family = "Times") +
  theme(panel.border = element_rect(fill = NA, colour = NA),
        panel.grid.major = element_line(colour = "grey95"),
        panel.grid.minor = element_line(colour = NA),
        strip.text.x = element_text(size = 16),
        strip.background = element_rect(fill = "white"),
        # right-align the rotated labels so each one ends at its tick,
        # matching the race and gender figures
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1.1)) +
  # ggplot drops (with a warning) any point whose mean_age is outside 40-53
  ylim(40, 53) +
  # A single blank string keeps the empty title strip above the panel;
  # `cex` is a base-graphics argument and has no meaning inside labs().
  labs(y = "Mean age", x = " ", title = " ")

ggsave(filename = "age_data_lost.png", plot = plot, width = 9, height = 5)

# time since release: one point per row of the table

# Values of `row` in x-axis order, paired position-by-position with the tick
# label displayed for each.
step_rows <- c("Universe", "Under supervision", "Deceased",
               "Felony status unclear", "Duplicates", "Over 70",
               "No last name", "Citizens", "Used in pilots",
               "Final: Addresses found")
# NOTE(review): the "Citizens" row is displayed as "Non-citizens" here, while
# the race and gender figures label the same row "Citizens" -- confirm which
# wording is intended. The existing label is kept unchanged.
step_labels <- c("Universe", "Under supervision", "Deceased",
                 "Felony status unclear", "Duplicates", "Over 70",
                 "No last name", "Non-citizens", "Used in pilots",
                 "Final: Addresses found")

d <- NC_dataloss %>%
  select(row, mean_release_time) %>%
  mutate(row = ifelse(row == "Status unclear", "Felony status unclear", row),
         # rows not listed in step_rows become NA
         step = factor(row, levels = step_rows, labels = step_labels))

plot <- ggplot(d, aes(y = mean_release_time, x = step)) +
  geom_point() +
  theme_bw(base_size = 16, base_family = "Times") +
  theme(panel.border = element_rect(fill = NA, colour = NA),
        panel.grid.major = element_line(colour = "grey95"),
        panel.grid.minor = element_line(colour = NA),
        strip.text.x = element_text(size = 16),
        strip.background = element_rect(fill = "white"),
        # right-align the rotated labels so each one ends at its tick,
        # matching the race and gender figures
        axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1.1)) +
  # ggplot drops (with a warning) any point whose value is outside 8-20
  ylim(8, 20) +
  # A single blank string keeps the empty title strip above the panel;
  # `cex` is a base-graphics argument and has no meaning inside labs().
  labs(y = "Mean years \nsince release", x = " ", title = " ")

ggsave(filename = "release_data_lost.png", plot = plot, width = 9, height = 5)

